import numpy as np
import pandas as pd
from datetime import datetime # To access datetime
from pandas import Series # To work on series
import warnings # To ignore the warnings
warnings.filterwarnings("ignore") #
from openpyxl import load_workbook #to load excel file into program
import glob # helps in dealing with computer paths
import os #to deal with computer operating system
import statsmodels.api as sm
%matplotlib inline
#Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
#from matplotlib import plt
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
from plotly import tools
import plotly.tools as tls
sns.set_style('whitegrid')
%matplotlib inline
# Load the monthly mandi (market) price data and the MSP (Minimum Support
# Price) reference data from CSVs in the working directory.
df = pd.read_csv('Monthly_data_cmo.csv')
MSP_df = pd.read_csv('CMO_MSP_Mandi_1.csv')
df.head(5)       # quick look at the monthly data
MSP_df.head()    # quick look at the MSP data
MSP_df.dtypes    # inspect column types (msprice may load as object)
#MSP_df["msprice"]=pd.to_numeric(MSP_df["msprice"])
First we convert all the values in the dataset to upper case to remove any case-related ambiguity.
# Count distinct commodities before normalisation, then upper-case every value
# in both frames so case differences do not create duplicate labels.
print(len(df['Commodity'].unique()))
df = df.apply(lambda x: x.astype(str).str.upper())
MSP_df = MSP_df.apply(lambda x: x.astype(str).str.upper())
Removing any punctuation and white spaces in the Commodity and APMC columns.
# Keep a reference for later (common-commodity intersection), then strip
# punctuation and surrounding whitespace from the Commodity and APMC columns
# so near-duplicate labels collapse to one canonical form.
df2 = df
# FIX: regex=True is passed explicitly — pandas >= 2.0 defaults
# Series.str.replace to regex=False, which would treat the pattern literally.
df["Commodity"] = df["Commodity"].str.replace(r'[^\w\s]', '', regex=True)
df['Commodity'] = df['Commodity'].str.strip()
df["APMC"] = df["APMC"].str.replace(r'[^\w\s]', '', regex=True)
df['APMC'] = df['APMC'].str.strip()
Now we can clearly see that the number of unique values for both Commodity and APMC has been reduced, thus eliminating duplicates and ambiguous values.
print(len(df['Commodity'].unique()))
Similarly, removing the whitespace and punctuation in the MSP dataset.
# Apply the same normalisation to the MSP data: trim whitespace and strip
# punctuation so its commodity labels line up with the monthly data.
MSP_df['commodity'] = MSP_df['commodity'].str.strip()
# FIX: regex=True is explicit — pandas >= 2.0 defaults str.replace to literal mode.
MSP_df['commodity'] = MSP_df['commodity'].str.replace(r'[^\w\s]', '', regex=True)
print(len(MSP_df["commodity"].unique()))
Now we can see that the number of commodities in the MSP dataset is much smaller than in the monthly data, so we will focus only on the commodities present in the MSP dataset.
Thus we find the common commodities in both datasets.
# Intersect the commodity sets of both datasets; only commodities that also
# have an MSP entry can be analysed against support prices.
common_commodities = list(set(MSP_df.commodity) & set(df2["Commodity"]))
print('The number of common commodities are')
print(len(common_commodities))
common_commodities.sort()
common_commodities
So now we will be focusing on these 19 common commodities.
# Restrict the monthly data to the common commodities, then drop rows with
# missing values.
df2=df2.loc[df2["Commodity"] .isin(common_commodities)]
#Removing any NULL values if present
monthly_df = df2.dropna()
Thus monthly_df is now the dataset on which we will work further; it contains only the 19 commodities of interest.
monthly_df.head()
monthly_df.dtypes
#the prices are also string type thus
#Converting the prices into numeric type for working on them
# (the earlier .astype(str) upper-casing turned every column into strings)
monthly_df["modal_price"]=pd.to_numeric(monthly_df["modal_price"])
monthly_df["max_price"]=pd.to_numeric(monthly_df["max_price"])
monthly_df["min_price"]=pd.to_numeric(monthly_df["min_price"])
Now, to detect outliers, we first plot a graph for each commodity separately, since prices vary across commodities.
# One box plot of modal price per commodity to eyeball outliers.
from pylab import rcParams
rcParams['figure.figsize'] = 25, 15
sns.boxplot(x=monthly_df['Commodity'] , y=monthly_df['modal_price'])
plt.savefig('Box_plot.jpg')
For better visualization of the outliers we plot each commodity separately, since the prices of each commodity vary. We iterate over the common_commodities list to draw a scatter plot and a box plot for each commodity, giving a clearer view of the outliers in its modal prices.
rcParams['figure.figsize'] = 10, 5
# Scatter plot of modal price over time, one figure per common commodity.
for i in range(len(common_commodities)):
    plt.figure()
    plt.title(common_commodities[i])
    #plt.subplot(10,10,10)
    plt.scatter(
        x= monthly_df.loc[monthly_df['Commodity'] == common_commodities[i]].date,
        y= monthly_df.loc[monthly_df['Commodity'] == common_commodities[i]].modal_price,
    )
# Box plot of modal prices, one figure per common commodity.
for i in range(len(common_commodities)):
    plt.figure()
    plt.title(common_commodities[i])
    #plt.subplot(10,10,10)
    sns.boxplot(
        #x= monthly_df.loc[monthly_df['Commodity'] == common_commodities[i]].date,
        x= monthly_df.loc[monthly_df['Commodity'] == common_commodities[i]].modal_price,
    )
# Now, for removing the outliers, each commodity is handled separately;
# BAJRI is used as the worked example below.
monthly_df_Bajra = monthly_df.loc[monthly_df['Commodity'].isin(['BAJRI'])]
rcParams['figure.figsize'] = 10, 5
sns.boxplot(x=monthly_df_Bajra['modal_price'])
rcParams['figure.figsize'] = 10, 5
plt.scatter(x=monthly_df_Bajra['date'] , y=monthly_df_Bajra['modal_price'])
plt.title('BAJRA')
plt.xlabel('Date')
plt.ylabel("Modal Prices")
Now, to remove outliers, we can use the mean and standard deviation method, where a value must lie within 3 standard deviations of the mean, as seen in the normal distribution curve. This is valid only when the data follows a normal distribution; from the graph, the distribution of Bajra prices is close to normal.
# Distribution of BAJRI modal prices (roughly normal, justifying the 3-sigma rule).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; histplot/displot
# is the modern equivalent.
sns.distplot(monthly_df_Bajra['modal_price'])
# Keep only rows within 3 standard deviations of the mean modal price.
monthly_df_Bajra_out_mean = monthly_df_Bajra[~(np.abs(monthly_df_Bajra['modal_price']-monthly_df_Bajra['modal_price'].mean()) > (3*monthly_df_Bajra['modal_price'].std()))]
rcParams['figure.figsize'] = 10, 5
plt.scatter(x=monthly_df_Bajra_out_mean['date'] , y=monthly_df_Bajra_out_mean['modal_price'])
plt.title('BAJRA Outliers Removed Using the mean Method')
plt.xlabel('Date')
plt.ylabel("Modal Prices")
Another way to remove outliers is the Inter-Quartile Range (IQR) method, where the data must lie between a lower bound derived from the 25th percentile and an upper bound derived from the 75th percentile.
For a clear visualization, see the box plot plotted above for the quartile ranges; the middle value is the median.
#Calculating the IQR
Q1 = monthly_df_Bajra.modal_price.quantile(0.25)   # 25th percentile
Q3 = monthly_df_Bajra.modal_price.quantile(0.75)   # 75th percentile
IQR = Q3 - Q1
print(IQR)
# Drop rows outside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
monthly_df_Bajra_out_IQR = monthly_df_Bajra[~((monthly_df_Bajra.modal_price < (Q1 - 1.5 * IQR)) |(monthly_df_Bajra.modal_price > (Q3 + 1.5 * IQR)))]
plt.scatter(
    x= monthly_df_Bajra_out_IQR.date,
    y= monthly_df_Bajra_out_IQR.modal_price,
    color='orange'
)
plt.title('BAJRA Outliers Removed Using the IQR Method')
plt.xlabel('Date')
plt.ylabel("Modal Prices")
# Overlay of the mean/std-filtered series for visual comparison.
plt.scatter(x=monthly_df_Bajra_out_mean['date'] , y=monthly_df_Bajra_out_mean['modal_price'] ,color='red')
plt.title('BAJRA Outliers Removed Using the mean Method')
plt.xlabel('Date')
plt.ylabel("Modal Prices")
Although both methods share the same intuition, we obtain better results with the IQR method, so we proceed with it.
#Now we can simply iterate over each commodity and remove the outliers
#Separating the dataframe based on each commodity
list_df={}   # i -> sub-frame holding only commodity common_commodities[i]
for i in range(len(common_commodities)):
    #plt.figure()
    #plt.title(common_commodities[i])
    list_df[i] = monthly_df.loc[monthly_df['Commodity'].isin([common_commodities[i]])]
#list_df has the values of separated dataframes created
#Calculating the IQR for each dataframe
Q1_list={}
Q3_list={}
IQR_list={}
for i in range(len(common_commodities)):
    Q1 = list_df[i].modal_price.quantile(0.25)
    Q3 = list_df[i].modal_price.quantile(0.75)
    Q1_list[i]=Q1
    Q3_list[i]=Q3
    IQR = Q3 - Q1
    IQR_list[i]=IQR
    #print(IQR)
#19 values corresponding to each commodity
IQR_list
# Apply each commodity's own Tukey fences computed above.
list_df_out={}
for i in range(len(common_commodities)):
    list_df_out[i] = list_df[i][~((list_df[i].modal_price < (Q1_list[i] - 1.5 * IQR_list[i])) |( list_df[i].modal_price > (Q3_list[i] + 1.5 * IQR_list[i])))]
#list_df_out has all the datasets with outliers removed
#Now concatenating all the separated dataframes into one final dataframe
df_final_out=pd.concat(list_df_out)
df_final_out.head()
rcParams['figure.figsize'] = 10, 5
# One scatter plot per commodity after outlier removal; each is saved to disk.
for i in range(len(common_commodities)):
    plt.figure()
    plt.scatter(
        x= df_final_out.loc[df_final_out['Commodity'] == common_commodities[i]].date,
        y= df_final_out.loc[df_final_out['Commodity'] == common_commodities[i]].modal_price,
        #ax=axs[i]
        #figsize=(40,20)
    )
    #plt.legend(loc='best',fontsize=20)
    plt.title(common_commodities[i] , fontsize=20)
    plt.xlabel('Date',fontsize=12)
    plt.ylabel('Modal_Prices',fontsize=12)
    plt.savefig("OutliersRemoved{y}.png".format(y=common_commodities[i]))
Thus we can clearly see that the majority of the outliers have been removed from our dataset; we can now continue with our EDA and complete the further tasks.
#Saving the cleaned data in a separate CSV file
df_final_out.to_csv('Cleaned_APMC_monthly.csv')
monthly_df_out=df_final_out   # alias used by the rest of the analysis
monthly_df_out.head()
Seasonality detection and treatment is done with the help of an illustrative example:
For interpretability, an analysis is performed on randomly chosen AMPC-Commodity combinations that consist of enough data points to conduct the time series analysis.
Only those groups which have atleast 24 observations are considered for an effective time series analysis.
Thus we have selected the commodity Bajri and the APMC Jalgaon.
# Illustrative cluster: Commodity = BAJRI at APMC = JALGAON.
df5_temp = monthly_df_out
df5_temp = df5_temp.loc[df5_temp['Commodity'].isin(['BAJRI']) & df5_temp['APMC'].isin(['JALGAON'])]
df5_test = monthly_df_out
df5_test = df5_test.loc[df5_test['Commodity'].isin(['BAJRI']) & df5_test['APMC'].isin(['JALGAON'])]
df5_test = df5_test[['date', 'modal_price']]
df5_test.head()
df5_test = df5_temp
# Parse 'YYYY-MM' dates. FIX: pd.datetime was removed in pandas >= 2.0; use
# the datetime class imported at the top of the file instead.
dateparse = lambda dates: datetime.strptime(dates, '%Y-%m')
df5_test = pd.read_csv('forecast_APMC_final.csv', parse_dates=['date'], index_col='date', date_parser=dateparse)  # date column becomes the index
df5_test = df5_test[['modal_price']]
df5_test
df5_test.index
df5_plot = df5_test
df5_test.plot()
# Build a plain float Series indexed by date for the time-series utilities.
data = np.array(df5_plot.iloc[:, 0].values, dtype=float)
print(type(data))
index = df5_plot.index
timeser = pd.Series(data=data, index=index)
df5_plot.dtypes
rcParams['figure.figsize'] = 5, 5
sns.lineplot(x=df5_test.index, y=df5_test['modal_price'], data=df5_test)
plt.title('Bajri price Across the months Jalgaon Mandi', fontsize=20)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Modal Prices', fontsize=15)
plt.savefig(' BajrijalgaonPrices')
Now from the above plot we can see the fluctuation in the time-series plot, and thus we need to check whether our data is stationary or not.
The Intution behind making the Data Stationary
We make the series stationary to make the variables independent. Variables can be dependent in various ways, but can only be independent in one way. So, we will get more information when they are independent. Hence the time series must be stationary.
If the time series is not stationary, firstly we have to make it stationary. For doing so, we need to remove the trend and seasonality from the data.
We use Dickey Fuller test to check the stationarity of the series.
The intuition behind this test is that it determines how strongly a time series is defined by a trend.
The null hypothesis of the test is that time series is not stationary (has some time-dependent structure).
The alternate hypothesis (rejecting the null hypothesis) is that the time series is stationary.
The test results comprise of a Test Statistic and some Critical Values for difference confidence levels. If the ‘Test Statistic’ is less than the ‘Critical Value’, we can reject the null hypothesis and say that the series is stationary.
#Creating a Series for ease of visualization
data=np.array(df5_test.iloc[:,0].values,dtype=float)   # raw float values
print(type(data))
index=df5_test.index
timeser=pd.Series(data=data,index=index)   # date-indexed float Series
We can also plot the autocorrelation function (ACF) versus lag. That is to say, we plot how correlated each subsequent lag is with the original:
# ACF / PACF diagnostics for the modal-price series.
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
fig_first = plot_acf(timeser,use_vlines=False)
fig = plt.figure(figsize=(20,10))
ax1 = fig.add_subplot(211)
fig = plot_acf(timeser, lags=10, ax=ax1)    # autocorrelation up to lag 10
ax2 = fig.add_subplot(212)
fig = plot_pacf(timeser, lags=10, ax=ax2)   # partial autocorrelation
It is clear that the autocorrelation is monotonically decreasing with time. This signifies that, at some time (t+n), there is still some relationship with the previous data points, (t+n-1), (t+n-2), through to initial time (t), with a large autocorrelation up until n=5 lag gaps. Therefore, we conclude that some time-dependent pattern seems to exist within this time series.
from statsmodels.tsa.stattools import adfuller
# Store in a function for later use!
def adf_check(time_series):
    """Run the Augmented Dickey-Fuller test on *time_series* and print a report.

    Prints the test statistic, p-value, lag count and observation count, then
    states whether the null hypothesis (non-stationarity) is rejected at the
    5% significance level.
    """
    labels = ['ADF Test Statistic','p-value','#Lags Used','Number of Observations Used']
    result = adfuller(time_series)
    print('Augmented Dickey-Fuller Test:')
    for label, value in zip(labels, result):
        print(f"{label} : {value}")
    # result[1] is the p-value of the test.
    if result[1] <= 0.05:
        print("strong evidence against the null hypothesis, reject the null hypothesis. Data has no unit root and is stationary")
    else:
        print("weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary ")
adf_check(df5_test['modal_price'])
Any given time series is thought to consist of three systematic components including level, trend, seasonality, and one non-systematic component called noise. These components are defined as follows:
Level: The average value in the series.
Trend: The increasing or decreasing value in the series.
Seasonality: The repeating short-term cycle in the series.
Noise: The random variation in the series.
A series is thought to be an aggregate or combination of these four components. All series have a level and noise. The trend and seasonality components are optional.
It is helpful to think of the components as combining either additively or multiplicatively.
Additive Model An additive model suggests that the components are added together as follows:
y(t) = Level + Trend + Seasonality + Noise
An additive model is linear where changes over time are consistently made by the same amount.
Multiplicative Model A multiplicative model suggests that the components are multiplied together as follows:
y(t) = Level * Trend * Seasonality * Noise
A multiplicative model is nonlinear, such as quadratic or exponential. Changes increase or decrease over time.
A nonlinear trend is a curved line.
A non-linear seasonality has an increasing or decreasing frequency and/or amplitude over time.
from pandas import Series
from matplotlib import pyplot
from statsmodels.tsa.seasonal import seasonal_decompose
# Additive decomposition with a 3-month cycle.
# NOTE(review): the freq= keyword was renamed to period= in newer statsmodels.
result_a = seasonal_decompose(timeser, model='additive',freq=3)
result_a.plot()
# FIX: save before show() — pyplot.show() closes/clears the current figure,
# so calling savefig afterwards wrote an empty image.
pyplot.savefig('DecompositionAdditive.jpg')
pyplot.show()
# Multiplicative decomposition with the same 3-month cycle.
result_m = seasonal_decompose(timeser, model='multiplicative',freq=3)
result_m.plot()
pyplot.savefig('DecompositionMultiplicative.jpg')
pyplot.show()
Now we can clearly see from the above seasonal decomposition with a frequency of 3 months that the prices fluctuate every 3 months.
timeser.plot()
From the Above Decomposition plots it is clearly our datset has Seasonality and a strong trend Line
A seasonal trend cycle repeating itself after 3 months
#for seasonal decomposition we use the diffrencing method
We can difference the dataset manually.
This involves developing a new function that creates a differenced dataset. The function would loop through a provided series and calculate the differenced values at the specified interval or lag.
The simplest approach to determining if there is an aspect of seasonality is to plot and review your data, perhaps at different scales and with the addition of trend lines.
We Apply the difference() function to our seasonal dataset
from math import sin
from math import radians
from matplotlib import pyplot
def difference(dataset, interval=1):
    """Return the lag-`interval` differences of *dataset* as a list.

    Element i of the result is dataset[i + interval] - dataset[i]; the result
    is therefore `interval` elements shorter than the input.
    """
    return [dataset[idx] - dataset[idx - interval]
            for idx in range(interval, len(dataset))]
def inverse_difference(last_ob, value):
    """Undo one differencing step: add the prior observation back onto *value*."""
    return last_ob + value
# define a dataset with a linear trend
#data = [sin(radians(i)) for i in range(360)] + [sin(radians(i)) for i in range(360)]
# 'data' here is the modal-price array built earlier from df5_plot.
pyplot.plot(data)
pyplot.show()
# difference the dataset at lag 3 (the 3-month seasonal cycle)
diff = difference(data, 3)
pyplot.plot(diff)
pyplot.show()
# invert the difference to reconstruct the original scale
inverted = [inverse_difference(data[i], diff[i]) for i in range(len(diff))]
pyplot.plot(inverted)
pyplot.show()
plt.figure(figsize=(16,8))
pyplot.plot(inverted , label = 'Raw Modal price')
pyplot.plot(timeser.values , label = 'Deseasonalised Price')
plt.title("Price Comaprison")
plt.legend(loc='best')
We can clearly see that the differencing method cannot completely remove the seasonality from our data. Thus we will create our own seasonal-decomposition function, similar to statsmodels' seasonal_decompose.
The decompose() function is defined based on the workings of the original seasonal_decompose function.
def decompose(df, period=365, lo_frac=0.6, lo_delta=0.01):
    """Create a seasonal-trend (with Loess, aka "STL") decomposition of observed time series data.

    This implementation is modeled after the ``statsmodels.tsa.seasonal_decompose``
    method but substitutes a Lowess regression for a convolution in its trend
    estimation. This is an additive model, Y[t] = T[t] + S[t] + e[t].
    For more details on lo_frac and lo_delta, see
    ``statsmodels.nonparametric.smoothers_lowess.lowess()``.

    Args:
        df (pandas.DataFrame): Time series of observed counts. Must be continuous
            (no gaps or missing data) and carry a ``pandas.DatetimeIndex``.
        period (int, optional): Most significant periodicity of the series, in
            units of 1 observation (e.g. 365 for annual cycles in daily data).
        lo_frac (float, optional): Fraction of data used in each Lowess fit.
        lo_delta (float, optional): Lowess delta, as a fraction of series length.

    Returns:
        A DecomposeResult with seasonal, trend, resid, observed and
        deseasonalised components, plus the average seasonal cycle.

    NOTE(review): `_maybe_get_pandas_wrapper_freq`, `pd_nanmean` and
    `DecomposeResult` are not imported anywhere visible in this file — they are
    statsmodels-internal names; confirm the notebook imports them elsewhere.
    """
    # use some existing pieces of statsmodels
    lowess = sm.nonparametric.lowess
    _pandas_wrapper, _ = _maybe_get_pandas_wrapper_freq(df)
    # get plain np array
    observed = np.asanyarray(df).squeeze()
    # calc trend, remove from observation
    trend = lowess(observed, [x for x in range(len(observed))],
                   frac=lo_frac,
                   delta=lo_delta * len(observed),
                   return_sorted=False)
    detrended = observed - trend
    # period must not be larger than size of series to avoid introducing NaNs
    period = min(period, len(observed))
    # calc one-period seasonality, remove tiled array from detrended
    period_averages = np.array([pd_nanmean(detrended[i::period]) for i in range(period)])
    # 0-center the period avgs
    period_averages -= np.mean(period_averages)
    seasonal = np.tile(period_averages, len(observed) // period + 1)[:len(observed)]
    resid = detrended - seasonal
    deseasonalised = observed - seasonal
    # convert the arrays back to appropriate dataframes, stuff them back into
    # the statsmodel object
    results = list(map(_pandas_wrapper, [seasonal, trend, resid, observed,deseasonalised]))
    dr = DecomposeResult(seasonal=results[0],
                         trend=results[1],
                         resid=results[2],
                         observed=results[3],
                         deseasonalised=results[4],
                         period_averages=period_averages)
    return dr
# Additive custom decomposition with a 3-observation seasonal cycle.
result_additive = decompose(df5_test, period=3)
result_additive.plot()
# FIX: savefig must run before plt.show(); show() clears the active figure,
# so the previous order wrote a blank image file.
plt.savefig("My Decompose Results Add.jpg")
plt.show()
Now we can clearly see that the graph is very similar to the one obtained from the statsmodels seasonal_decompose function.
#Now creating a separate dataset with only the modal prices
#and the deseasonalised prices for the selected APMC/Commodity cluster
# FIX: acf is used below but was only imported much later in the notebook;
# import it here so this cell is self-contained.
from statsmodels.tsa.stattools import acf
# NOTE(review): result_additive[4] relies on the decompose result supporting
# integer indexing (index 4 = deseasonalised) — confirm the DecomposeResult
# class in use allows this.
deseasonalised_data = np.array(result_additive[4].values, dtype=float)
df5_deses = pd.read_csv('forecast_APMC_final.csv', parse_dates=['date'], index_col='date', date_parser=dateparse)  # indexed the date column
df5_deses = df5_test[['modal_price']]
df5_deses
df5_deses['deseasonalised_prices'] = deseasonalised_data
df5_deses.plot()
plt.legend(loc='best', fontsize=16)
plt.title('Price Comparison of Raw and Deseasonalised Prices' , fontsize=25)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Prices', fontsize=15)
rcParams['figure.figsize'] = 25, 15
plt.savefig('Comp Raw and Deseasonalised Add.jpg')
# Residual autocorrelations; their sum is compared against the multiplicative
# model's later to pick the better decomposition.
residuals_addtive = result_additive[2]
acf_additive = acf(residuals_addtive)
acf_additive
sum(acf_additive)
def decompose_multiplicative(df, period=365):
    """Create a seasonal-trend (with Loess, aka "STL") decomposition of observed time series data.

    This implementation is modeled after the ``statsmodels.tsa.seasonal_decompose``
    method but substitutes a Lowess regression for a convolution in its trend
    estimation. This is a multiplicative model, Y[t] = T[t] * S[t] * e[t].

    Args:
        df (pandas.DataFrame): Time series of observed counts. Must be continuous
            (no gaps or missing data) and carry a ``pandas.DatetimeIndex``.
        period (int, optional): Most significant periodicity of the series, in
            units of 1 observation.

    Returns:
        A DecomposeResult with seasonal, trend, resid, observed and
        deseasonalised components, plus the average seasonal cycle.

    NOTE(review): `_maybe_get_pandas_wrapper_freq`, `pd_nanmean` and
    `DecomposeResult` are not imported anywhere visible in this file — confirm
    they are brought into scope elsewhere.
    """
    # use some existing pieces of statsmodels
    lowess = sm.nonparametric.lowess
    _pandas_wrapper, _ = _maybe_get_pandas_wrapper_freq(df)
    # get plain np array
    observed = np.asanyarray(df).squeeze()
    # calc trend, remove from observation (division, not subtraction: multiplicative)
    trend = lowess(observed, [x for x in range(len(observed))],
                   return_sorted=False)
    detrended = observed / trend
    # period must not be larger than size of series to avoid introducing NaNs
    period = min(period, len(observed))
    # calc one-period seasonality, remove tiled array from detrended
    period_averages = np.array([pd_nanmean(detrended[i::period]) for i in range(period)])
    # normalise the period avgs so they multiply to ~1 (multiplicative analogue
    # of 0-centering)
    period_averages /= np.mean(period_averages)
    seasonal = np.tile(period_averages, len(observed) // period + 1)[:len(observed)]
    resid = detrended / seasonal
    deseasonalised = observed / seasonal
    # convert the arrays back to appropriate dataframes, stuff them back into
    # the statsmodel object
    results = list(map(_pandas_wrapper, [seasonal, trend, resid, observed,deseasonalised]))
    dr = DecomposeResult(seasonal=results[0],
                         trend=results[1],
                         resid=results[2],
                         observed=results[3],
                         deseasonalised=results[4],
                         period_averages=period_averages)
    return dr
# Multiplicative custom decomposition with a 3-observation seasonal cycle.
result_multiplicative = decompose_multiplicative(df5_test, 3)
result_multiplicative.plot()
# FIX: save before show() — show() clears the figure, so the old order saved
# an empty image.
plt.savefig("My Decompose Results Mult")
plt.show()
result_multiplicative_array = decompose_multiplicative(df5_test, 3)
deseasonalised_data_m = np.array(result_multiplicative_array[4].values, dtype=float)
df5_deses_m = pd.read_csv('forecast_APMC8.csv', parse_dates=['date'], index_col='date', date_parser=dateparse)  # indexed the date column
df5_deses_m = df5_test[['modal_price']]
df5_deses_m
df5_deses_m['deseasonalised_prices'] = deseasonalised_data_m
df5_deses_m
#Adding the deseasonalised prices column to our dataset
df5_temp['deseasonalised_prices'] = deseasonalised_data_m
#Saving the deseasonalised prices in a separate file
df5_temp.to_csv('DeseasonalisedData.csv')
rcParams['figure.figsize'] = 25, 15
df5_deses_m.plot(fontsize=20)
plt.legend(loc='best', fontsize=20)
plt.title('Price Comparison of Raw and Deseasonalised Prices' , fontsize=25)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Prices', fontsize=15)
rcParams['figure.figsize'] = 25, 15
plt.savefig('Comp Raw and Deseasonalised Mult.jpg')
residuals_multiplicative = result_multiplicative_array[2]
acf_multiplicative = acf(residuals_multiplicative)
acf_multiplicative
sum(acf_multiplicative)
# Lower summed residual autocorrelation indicates the better-fitting model.
if sum(acf_multiplicative) < sum(acf_additive):
    print('Multiplicative Model')
else:
    print('Additive Model')
def decompose_multiplicative_arr(df, period=365):
    """Multiplicative Lowess decomposition returning the raw component list.

    Same computation as ``decompose_multiplicative`` but returns the plain list
    [seasonal, trend, resid, observed, deseasonalised] instead of a
    DecomposeResult object.

    NOTE(review): `_maybe_get_pandas_wrapper_freq` and `pd_nanmean` are
    statsmodels-internal helpers not imported in this file — confirm.
    """
    # use some existing pieces of statsmodels
    lowess = sm.nonparametric.lowess
    _pandas_wrapper, _ = _maybe_get_pandas_wrapper_freq(df)
    observed = np.asanyarray(df).squeeze()
    trend = lowess(observed, [x for x in range(len(observed))],
                   return_sorted=False)
    detrended = observed / trend
    period = min(period, len(observed))
    # calc one-period seasonality, remove tiled array from detrended
    period_averages = np.array([pd_nanmean(detrended[i::period]) for i in range(period)])
    # normalise the period avgs so they multiply to ~1
    period_averages /= np.mean(period_averages)
    seasonal = np.tile(period_averages, len(observed) // period + 1)[:len(observed)]
    resid = detrended / seasonal
    deseasonalised = observed / seasonal
    results = list(map(_pandas_wrapper, [seasonal, trend, resid, observed,deseasonalised]))
    return results
def decompose_additive_arr(df, period=365):
    """Additive Lowess decomposition returning the raw component list.

    Same computation as ``decompose`` (with default Lowess settings) but returns
    the plain list [seasonal, trend, resid, observed, deseasonalised] instead of
    a DecomposeResult object.

    NOTE(review): `_maybe_get_pandas_wrapper_freq` and `pd_nanmean` are
    statsmodels-internal helpers not imported in this file — confirm.
    """
    # use some existing pieces of statsmodels
    lowess = sm.nonparametric.lowess
    _pandas_wrapper, _ = _maybe_get_pandas_wrapper_freq(df)
    observed = np.asanyarray(df).squeeze()
    trend = lowess(observed, [x for x in range(len(observed))],
                   return_sorted=False)
    detrended = observed - trend
    period = min(period, len(observed))
    # calc one-period seasonality, remove tiled array from detrended
    period_averages = np.array([pd_nanmean(detrended[i::period]) for i in range(period)])
    # 0-center the period avgs
    period_averages -= np.mean(period_averages)
    seasonal = np.tile(period_averages, len(observed) // period + 1)[:len(observed)]
    resid = detrended - seasonal
    deseasonalised = observed - seasonal
    results = list(map(_pandas_wrapper, [seasonal, trend, resid, observed,deseasonalised]))
    return results
from statsmodels.tsa.stattools import acf
def checkModel(df):
    """Decide between multiplicative and additive decomposition for *df*.

    Decomposes the series both ways with a 3-period seasonal cycle and compares
    the summed autocorrelation of the residuals; the model with the smaller sum
    fits better. Prints the chosen model and returns 1 for multiplicative,
    0 for additive.
    """
    # FIX: operate on the df argument — the original read the global df5_test,
    # so the parameter was silently ignored for any other input.
    result_multiplicative_array = decompose_multiplicative_arr(df, 3)
    result_additive_array = decompose_additive_arr(df, 3)
    residuals_multiplicative = result_multiplicative_array[2]
    acf_multiplicative = acf(residuals_multiplicative)
    residuals_addtive = result_additive_array[2]
    acf_additive = acf(residuals_addtive)
    if sum(acf_multiplicative) <= sum(acf_additive):
        print('Multiplicative Model')
        return 1
    else:
        print('Additive Model')
        return 0
checkModel(df5_test)
Thus from the above check it is clear that we need to use the multiplicative model.
def deseasonalise_data(df, period):
    """Append a 'deseasonalised_prices' column to *df* in place and return it.

    Uses checkModel() to pick the multiplicative or additive decomposition,
    then stores the deseasonalised component (index 4 of the result list).
    """
    if checkModel(df) == 1:
        print('Deseasonalising the Data using Multiplicative Model')
        result = decompose_multiplicative_arr(df, period)
    else:
        print('Deseasonalising the Data using Additive Model')
        result = decompose_additive_arr(df, period)
    # Index 4 of the component list is the deseasonalised series.
    df['deseasonalised_prices'] = np.array(result[4].values, dtype=float)
    return df
def deseasonalise_data_mod(df_final_in, period, comm, APMC):
    """Deseasonalise the modal prices of one Commodity/APMC cluster.

    Filters *df_final_in* to the given commodity and APMC, round-trips the
    subset through a temporary CSV to obtain a date-indexed frame, then adds a
    'deseasonalised_prices' column using whichever decomposition model
    checkModel() selects. Returns the date-indexed frame.

    Side effects: writes 'temp1.csv' to the working directory.
    """
    df = df_final_in
    df = df.loc[(df['Commodity'] == comm) & (df['APMC'] == APMC)]
    df = df[['date', 'modal_price']]
    df.to_csv('temp1.csv')
    # FIX: pd.datetime was removed in pandas >= 2.0; use the datetime class
    # imported at the top of the file.
    dateparse = lambda dates: datetime.strptime(dates, '%Y-%m')
    df = pd.read_csv('temp1.csv', parse_dates=['date'], index_col='date', date_parser=dateparse)  # indexed the date column
    df = df[['modal_price']]
    if checkModel(df) == 1:
        print('Deseasonalising the Data using Multiplicative Model')
        result = decompose_multiplicative_arr(df, period)
    else:
        print('Deseasonalising the Data using Additive Model')
        result = decompose_additive_arr(df, period)
    df['deseasonalised_prices'] = np.array(result[4].values, dtype=float)
    return df
# Sample cluster: BAJRI at CHOPDA, round-tripped through CSV so the date
# column becomes a parsed DatetimeIndex.
df = df_final_out
df = df.loc[(df['Commodity'] == "BAJRI") & (df['APMC'] == "CHOPDA")]
df = df[['date', 'modal_price']]
df.to_csv('forecast_APMC_Sample1.csv')
# FIX: pd.datetime was removed in pandas >= 2.0; use the datetime class.
dateparse = lambda dates: datetime.strptime(dates, '%Y-%m')
df = pd.read_csv('forecast_APMC_Sample1.csv', parse_dates=['date'], index_col='date', date_parser=dateparse)
df = df[['modal_price']]
df.index
# FIX: df_test was inspected before it was assigned (NameError); assign first.
df_test = df_final_out
df_test
deseasonalise_data_mod(df_test, 3, "SORGUMJAWAR", "GANGAPUR")
#result=decompose_multiplicative_arr(df_test,3)
MSP_df.dtypes
MSP_df_test=MSP_df
MSP_df_test = MSP_df_test.rename(columns={'commodity': 'Commodity', 'year': 'Year'})
MSP_df_test.head(2)
monthly_df_out_test=monthly_df_out
monthly_df_out_test.head()
merged_df = pd.merge(monthly_df_out_test, MSP_df_test, how='inner', on=['Commodity', 'Year'])
merged_df=merged_df.dropna()
merged_df.dtypes
merged_df.head()
# Example cluster for MSP comparison.
# NOTE(review): the variable is named BAJRI but the filter selects
# WHEATHUSKED at DHULE — confirm which cluster was intended.
merged_df_BAJRI = merged_df.loc[merged_df['Commodity'].isin(['WHEATHUSKED']) & merged_df['APMC'].isin(['DHULE'])]
ms_price = np.array(merged_df_BAJRI['msprice'], dtype=float)   # numeric copy (msprice is still a string column)
merged_df_BAJRI_test = merged_df_BAJRI[['date', 'modal_price','msprice']]
merged_df_BAJRI_test['ms_price'] = ms_price
merged_df_BAJRI_test.head()
merged_df_BAJRI_test = merged_df_BAJRI_test[['date', 'modal_price','ms_price']]
merged_df_BAJRI_test.to_csv('merged_BAJRI.csv')
# FIX: pd.datetime was removed in pandas >= 2.0; use the datetime class.
dateparse = lambda dates: datetime.strptime(dates, '%Y-%m')
merged_df_BAJRI_test = pd.read_csv('merged_BAJRI.csv', parse_dates=['date'], index_col='date', date_parser=dateparse)  # indexed the date column
merged_df_BAJRI_test = merged_df_BAJRI_test[['modal_price','ms_price']]
merged_df_BAJRI_test
# NOTE(review): deseasonalised_data_m was computed for the earlier cluster;
# its length must match this cluster or this assignment raises.
merged_df_BAJRI_test['deseasonalised_prices'] = deseasonalised_data_m
sns.lineplot(x=df5_deses_m.index, y=df5_deses_m['modal_price'], data=merged_df_BAJRI_test , label='Modal Prices(Raw)' )
sns.lineplot(x=df5_deses_m.index, y=merged_df_BAJRI_test['ms_price'], data=merged_df_BAJRI_test, label='MSP price')
sns.lineplot(x=df5_deses_m.index, y=merged_df_BAJRI_test['deseasonalised_prices'], data=merged_df_BAJRI_test, label='Modal Prices(Deseasonalised)')
plt.legend(loc='best', fontsize=20)
plt.title('Price Comparison of Raw and Deseasonalised prcies and MSP price' , fontsize=25)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Prices', fontsize=15)
rcParams['figure.figsize'] = 25, 15
plt.savefig('Comp Raw Des and MSP.jpg')
#This function takes the merged data frame as the input
#and the name of the APMC and Commodity cluster, and builds a dataset with
#raw and deseasonalised modal prices plus the MSP prices;
#the dataframe thus obtained can be plotted to get the required curves
def compareMSP(merged_df, comm, APMC, period):
    """Build a date-indexed frame with modal, deseasonalised and MSP prices.

    Filters *merged_df* to the given Commodity/APMC cluster, extracts the
    numeric MSP series, deseasonalises the modal prices via
    deseasonalise_data_mod(), and returns the combined frame.

    Side effects: writes 'merged_SAMPLE1.csv' (and, via
    deseasonalise_data_mod, 'temp1.csv') to the working directory.
    """
    merged_df_sample = merged_df.loc[(merged_df['Commodity'] == comm) & (merged_df['APMC'] == APMC)]
    ms_price = np.array(merged_df_sample['msprice'], dtype=float)
    merged_df_sample = merged_df_sample[['date', 'modal_price', 'msprice']]
    merged_df_sample['ms_price'] = ms_price
    merged_df_sample = merged_df_sample[['date', 'modal_price', 'ms_price']]
    merged_df_sample.to_csv('merged_SAMPLE1.csv')
    # FIX: pd.datetime was removed in pandas >= 2.0; use the datetime class.
    dateparse = lambda dates: datetime.strptime(dates, '%Y-%m')
    merged_df_sample = pd.read_csv('merged_SAMPLE1.csv', parse_dates=['date'], index_col='date', date_parser=dateparse)
    merged_df_sample = merged_df_sample[['modal_price', 'ms_price']]
    dfdes_temp = deseasonalise_data_mod(merged_df, period, comm, APMC)
    # FIX: the original referenced an undefined name (compare_df), which raised
    # NameError on every call; the MSP column comes from the filtered,
    # date-indexed sample built above.
    dfdes_temp['ms_price'] = merged_df_sample['ms_price']
    return dfdes_temp
Fluctuation is the difference between the maximum and minimum prices
# Flag the price fluctuation for every (Commodity, APMC, month) row: the
# spread between the monthly maximum and minimum price.
monthly_df_out['fluctuation'] = monthly_df_out['max_price'].sub(monthly_df_out['min_price'])
monthly_df_out.head()
# Rank rows so the widest swings come first; ties broken by most recent year.
monthly_df_fluc = monthly_df_out.sort_values(by=['fluctuation', 'Year'], ascending=False)
monthly_df_fluc.head(11)
## we can clearly see the Maximum Fluctuation of each Commodity , APMC , Months
monthly_df_fluc_final = pd.DataFrame(monthly_df_fluc)
# Persist the flagged fluctuations to their own file.
monthly_df_fluc_final.to_csv('Flagged Fluctuations.csv')
Running the above seasonality removal on a few examples
The function deseasonalise_data_mod() takes 4 arguments: the dataset, the time period, and the Commodity and APMC cluster we want.
The function uses the checkModel() function to check which model — additive or multiplicative — should be used.
The dataframe returned by this function is the dataframe with the deseasonalised prices, which is our matter of interest.
# Example 1: deseasonalise BAJRI prices at the CHOPDA APMC (period = 3).
des_data = deseasonalise_data_mod(df_final_out, 3, "BAJRI", "CHOPDA")
# BUG FIX: `rcParams` was never imported (NameError), and setting the figure
# size after plotting has no effect; pass figsize to plot() instead.
des_data.plot(figsize=(25, 15))
plt.title('Price Comparison of Raw and Deseasonalised Prices BAJRI AND CHOPDA', fontsize=25)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Prices', fontsize=15)
plt.legend(loc='best', fontsize=15)
plt.savefig('Price Comparison Raw and Deseasonalised1.jpg')
# Example 2: deseasonalise RICEPADDYHUS prices at the NAGPUR APMC (period = 3).
des_data2 = deseasonalise_data_mod(df_final_out, 3, "RICEPADDYHUS", "NAGPUR")
des_data2.plot(figsize=(25, 15))
plt.title('Price Comparison of Raw and Deseasonalised Prices "RICEPADDYHUS NAGPUR" ', fontsize=25)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Prices', fontsize=15)
plt.legend(loc='best', fontsize=15)
plt.savefig('Price Comparison Raw and Deseasonalised2.jpg')
In the examples below we use the compareMSP() function to compare prices. The function takes four arguments: the dataframe for comparison, the APMC and Commodity names, and the period used for deseasonalising the data.
# Example clusters compared against MSP. In each case the figure is sized
# via plot(figsize=...): the original `rcParams['figure.figsize']` lines
# raised NameError (rcParams was never imported) and, being placed after
# plotting, would not have resized the current figure anyway.
temp_data = compareMSP(merged_df, "BAJRI", "CHOPDA", 3)
temp_data.plot(figsize=(25, 15))
plt.title('Price Comparison of Raw and Deseasonalised Prices MSP Prices BAJRI AND CHOPDA', fontsize=25)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Prices', fontsize=15)
plt.legend(loc='best', fontsize=15)
plt.savefig('Price Comparison Raw and DeseasonalisedSample1.jpg')
temp_data_2 = compareMSP(merged_df, "SORGUMJAWAR", "GANGAPUR", 3)
temp_data_2.plot(figsize=(25, 15))
plt.title('Price Comparison of Raw and Deseasonalised Prices MSP Prices SORGUMJAWAR GANGAPUR', fontsize=25)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Prices', fontsize=15)
plt.legend(loc='best', fontsize=15)
plt.savefig('Price Comparison Raw and DeseasonalisedSample2.jpg')
temp_data3 = compareMSP(merged_df, "BAJRI", "JALGAON", 3)
temp_data3.plot(figsize=(25, 15))
plt.title('Price Comparison of Raw and Deseasonalised Prices MSP Prices BAJRI JALGAON', fontsize=25)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Prices', fontsize=15)
plt.legend(loc='best', fontsize=15)
plt.savefig('Price Comparison Raw and DeseasonalisedSample3.jpg')
temp_data4 = compareMSP(merged_df, "WHEATHUSKED", "DHULE", 3)
temp_data4.plot(figsize=(25, 15))
plt.title('Price Comparison of Raw and Deseasonalised Prices MSP Prices WHEATHUSKED DHULE', fontsize=25)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Prices', fontsize=15)
plt.legend(loc='best', fontsize=15)
plt.savefig('Price Comparison Raw and DeseasonalisedSample4.jpg')
We have used the IQR method for outlier removal, as it simplifies the process and is the most intuitive approach.
To remove the seasonality from the data and deseasonalise the dataset, we simply use the deseasonalise_data_mod() function and pass the relevant cluster of APMC and Commodity.
Thus both of the above methods can be run across different clusters; the examples picked are those APMC and Commodity clusters which have enough data for a time series plot to be obtained and for further processing to take place.
A very interesting and challenging task with a lot of learning involved.
I was completely new to the concept of time series, so learning about time series and then implementing the project within a period of 4 days was a very challenging and thrilling experience.
The most interesting part was the outlier removal task, followed by seasonality removal and deseasonalising the data; implementing the seasonal decomposition on my own gave me a pretty good idea of how the seasonal_decompose function actually works.